import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, \
OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('Thyroid.csv')
df
| age | sex | on_thyroxine | query_on_thyroxine | on_antithyroid_meds | sick | pregnant | thyroid_surgery | I131_treatment | query_hypothyroid | ... | TT4 | T4U_measured | T4U | FTI_measured | FTI | TBG_measured | TBG | referral_source | target | patient_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 29 | F | f | f | f | f | f | f | f | t | ... | NaN | f | NaN | f | NaN | f | NaN | other | - | 840801013 |
| 1 | 29 | F | f | f | f | f | f | f | f | f | ... | 128.0 | f | NaN | f | NaN | f | NaN | other | - | 840801014 |
| 2 | 41 | F | f | f | f | f | f | f | f | f | ... | NaN | f | NaN | f | NaN | t | 11.0 | other | - | 840801042 |
| 3 | 36 | F | f | f | f | f | f | f | f | f | ... | NaN | f | NaN | f | NaN | t | 26.0 | other | - | 840803046 |
| 4 | 32 | F | f | f | f | f | f | f | f | f | ... | NaN | f | NaN | f | NaN | t | 36.0 | other | S | 840803047 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9167 | 56 | M | f | f | f | f | f | f | f | f | ... | 64.0 | t | 0.83 | t | 77.0 | f | NaN | SVI | - | 870119022 |
| 9168 | 22 | M | f | f | f | f | f | f | f | f | ... | 91.0 | t | 0.92 | t | 99.0 | f | NaN | SVI | - | 870119023 |
| 9169 | 69 | M | f | f | f | f | f | f | f | f | ... | 113.0 | t | 1.27 | t | 89.0 | f | NaN | SVI | I | 870119025 |
| 9170 | 47 | F | f | f | f | f | f | f | f | f | ... | 75.0 | t | 0.85 | t | 88.0 | f | NaN | other | - | 870119027 |
| 9171 | 31 | M | f | f | f | f | f | f | f | t | ... | 66.0 | t | 1.02 | t | 65.0 | f | NaN | other | - | 870119035 |
9172 rows × 31 columns
df.dtypes
age int64 sex object on_thyroxine object query_on_thyroxine object on_antithyroid_meds object sick object pregnant object thyroid_surgery object I131_treatment object query_hypothyroid object query_hyperthyroid object lithium object goitre object tumor object hypopituitary object psych object TSH_measured object TSH float64 T3_measured object T3 float64 TT4_measured object TT4 float64 T4U_measured object T4U float64 FTI_measured object FTI float64 TBG_measured object TBG float64 referral_source object target object patient_id int64 dtype: object
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9172 entries, 0 to 9171 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 9172 non-null int64 1 sex 8865 non-null object 2 on_thyroxine 9172 non-null object 3 query_on_thyroxine 9172 non-null object 4 on_antithyroid_meds 9172 non-null object 5 sick 9172 non-null object 6 pregnant 9172 non-null object 7 thyroid_surgery 9172 non-null object 8 I131_treatment 9172 non-null object 9 query_hypothyroid 9172 non-null object 10 query_hyperthyroid 9172 non-null object 11 lithium 9172 non-null object 12 goitre 9172 non-null object 13 tumor 9172 non-null object 14 hypopituitary 9172 non-null object 15 psych 9172 non-null object 16 TSH_measured 9172 non-null object 17 TSH 8330 non-null float64 18 T3_measured 9172 non-null object 19 T3 6568 non-null float64 20 TT4_measured 9172 non-null object 21 TT4 8730 non-null float64 22 T4U_measured 9172 non-null object 23 T4U 8363 non-null float64 24 FTI_measured 9172 non-null object 25 FTI 8370 non-null float64 26 TBG_measured 9172 non-null object 27 TBG 349 non-null float64 28 referral_source 9172 non-null object 29 target 9172 non-null object 30 patient_id 9172 non-null int64 dtypes: float64(6), int64(2), object(23) memory usage: 2.2+ MB
df.drop("patient_id", axis=1, inplace=True)
columns_to_keep = ['age', 'sex', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG', 'target']
# Drop columns except the specified ones
df = df[columns_to_keep]
df
| age | sex | TSH | T3 | TT4 | T4U | FTI | TBG | target | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 29 | F | 0.3 | NaN | NaN | NaN | NaN | NaN | - |
| 1 | 29 | F | 1.6 | 1.9 | 128.0 | NaN | NaN | NaN | - |
| 2 | 41 | F | NaN | NaN | NaN | NaN | NaN | 11.0 | - |
| 3 | 36 | F | NaN | NaN | NaN | NaN | NaN | 26.0 | - |
| 4 | 32 | F | NaN | NaN | NaN | NaN | NaN | 36.0 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9167 | 56 | M | NaN | NaN | 64.0 | 0.83 | 77.0 | NaN | - |
| 9168 | 22 | M | NaN | NaN | 91.0 | 0.92 | 99.0 | NaN | - |
| 9169 | 69 | M | NaN | NaN | 113.0 | 1.27 | 89.0 | NaN | I |
| 9170 | 47 | F | NaN | NaN | 75.0 | 0.85 | 88.0 | NaN | - |
| 9171 | 31 | M | NaN | NaN | 66.0 | 1.02 | 65.0 | NaN | - |
9172 rows × 9 columns
import pandas as pd
# Assuming df is your DataFrame containing the data
# Drop the 'TBG' column
df.drop('TBG', axis=1, inplace=True)
# Remove entries with 'target' column equal to "-"
df = df[df['target'] != "-"]
# Resetting index after dropping rows
df.reset_index(drop=True, inplace=True)
# Display the modified DataFrame
df
| age | sex | TSH | T3 | TT4 | T4U | FTI | target | |
|---|---|---|---|---|---|---|---|---|
| 0 | 32 | F | NaN | NaN | NaN | NaN | NaN | S |
| 1 | 63 | F | 68.00 | NaN | 48.0 | 1.02 | 47.0 | F |
| 2 | 75 | F | 0.05 | 1.6 | 157.0 | 0.89 | 176.0 | AK |
| 3 | 41 | M | 0.05 | 1.6 | 39.0 | 1.00 | 39.0 | R |
| 4 | 71 | F | 0.05 | NaN | 126.0 | 1.38 | 91.0 | I |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2396 | 64 | M | 0.81 | NaN | 31.0 | 0.55 | 56.0 | K |
| 2397 | 60 | M | 0.18 | NaN | 28.0 | 0.87 | 32.0 | K |
| 2398 | 64 | M | NaN | NaN | 44.0 | 0.53 | 83.0 | J |
| 2399 | 36 | F | NaN | NaN | 84.0 | 1.26 | 67.0 | I |
| 2400 | 69 | M | NaN | NaN | 113.0 | 1.27 | 89.0 | I |
2401 rows × 8 columns
import pandas as pd
# Display count of rows before removing null values
print("Number of rows before removing null values:", df.shape[0])
# Remove null values
df = df.dropna()
# Display count of rows after removing null values
print("Number of rows after removing null values:", df.shape[0])
Number of rows before removing null values: 2401 Number of rows after removing null values: 1621
print(df.isnull().sum())
age 0 sex 0 TSH 0 T3 0 TT4 0 T4U 0 FTI 0 target 0 dtype: int64
# List of valid target subdiseases
valid_subdiseases = ['A', 'B', 'F', 'G']
# Display row count before dropping
print("Row count before dropping:", len(df))
# Drop rows where 'target' column does not contain valid subdiseases
df = df[df['target'].isin(valid_subdiseases)]
# Display row count after dropping
print("Row count after dropping:", len(df))
Row count before dropping: 1621 Row count after dropping: 484
# Display count of each target class
target_counts = df['target'].value_counts()
print("Count of each target class:")
print(target_counts)
Count of each target class: target G 211 F 164 A 93 B 16 Name: count, dtype: int64
df.duplicated().sum()
0
target_counts = df['target'].value_counts()
fig = px.bar(x=target_counts.index, y=target_counts.values)
fig.update_layout(xaxis_title='Target', yaxis_title='Count', title='Distribution of Target')
fig.show()
df['sex'].replace({'M': 0, 'F': 1}, inplace=True)
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
df
| age | sex | TSH | T3 | TT4 | T4U | FTI | target | |
|---|---|---|---|---|---|---|---|---|
| 9 | 61 | 0 | 9.799999 | 1.2 | 114.0 | 0.84 | 136.0 | G |
| 10 | 27 | 1 | 90.000000 | 0.4 | 7.5 | 0.94 | 7.5 | F |
| 32 | 40 | 1 | 70.000000 | 0.4 | 3.9 | 0.83 | 5.0 | F |
| 40 | 57 | 0 | 0.250000 | 4.2 | 236.0 | 0.70 | 337.0 | A |
| 44 | 44 | 1 | 8.400000 | 1.8 | 108.0 | 1.01 | 107.0 | G |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2360 | 71 | 1 | 23.000000 | 1.8 | 87.0 | 0.96 | 91.0 | G |
| 2369 | 51 | 1 | 106.000000 | 0.6 | 5.0 | 0.89 | 5.5 | F |
| 2372 | 66 | 1 | 85.000000 | 1.8 | 118.0 | 1.27 | 93.0 | G |
| 2391 | 75 | 1 | 17.000000 | 1.4 | 104.0 | 1.15 | 90.0 | G |
| 2392 | 74 | 1 | 53.000000 | 1.0 | 49.0 | 1.25 | 39.0 | F |
484 rows × 8 columns
print(df.isnull().sum())
age 0 sex 0 TSH 0 T3 0 TT4 0 T4U 0 FTI 0 target 0 dtype: int64
df.dtypes
age int64 sex int64 TSH float64 T3 float64 TT4 float64 T4U float64 FTI float64 target object dtype: object
# Convert the 'target' column to numerical values
df['target'] = df['target'].astype('category').cat.codes
df
| age | sex | TSH | T3 | TT4 | T4U | FTI | target | |
|---|---|---|---|---|---|---|---|---|
| 9 | 61 | 0 | 9.799999 | 1.2 | 114.0 | 0.84 | 136.0 | 3 |
| 10 | 27 | 1 | 90.000000 | 0.4 | 7.5 | 0.94 | 7.5 | 2 |
| 32 | 40 | 1 | 70.000000 | 0.4 | 3.9 | 0.83 | 5.0 | 2 |
| 40 | 57 | 0 | 0.250000 | 4.2 | 236.0 | 0.70 | 337.0 | 0 |
| 44 | 44 | 1 | 8.400000 | 1.8 | 108.0 | 1.01 | 107.0 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2360 | 71 | 1 | 23.000000 | 1.8 | 87.0 | 0.96 | 91.0 | 3 |
| 2369 | 51 | 1 | 106.000000 | 0.6 | 5.0 | 0.89 | 5.5 | 2 |
| 2372 | 66 | 1 | 85.000000 | 1.8 | 118.0 | 1.27 | 93.0 | 3 |
| 2391 | 75 | 1 | 17.000000 | 1.4 | 104.0 | 1.15 | 90.0 | 3 |
| 2392 | 74 | 1 | 53.000000 | 1.0 | 49.0 | 1.25 | 39.0 | 2 |
484 rows × 8 columns
df.dtypes
age int64 sex int64 TSH float64 T3 float64 TT4 float64 T4U float64 FTI float64 target int8 dtype: object
cols_to_drop = ['sex', 'target']
data_subset = df.drop(cols_to_drop, axis=1)
# Plot boxplots
plt.figure(figsize=(12, 8))
sns.boxplot(data=data_subset)
plt.title('Distribution of Numerical Features')
plt.show()
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
data_cleaned = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]
print(data_cleaned.describe())
age sex TSH T3 TT4 T4U \
count 240.000000 240.0 240.000000 240.000000 240.000000 240.000000
mean 55.920833 1.0 19.507875 1.696667 77.457500 1.010792
std 17.894565 0.0 16.719436 0.697507 34.421876 0.132440
min 7.000000 1.0 0.020000 0.200000 2.000000 0.670000
25% 42.750000 1.0 7.875000 1.375000 58.000000 0.920000
50% 59.000000 1.0 12.000000 1.700000 80.000000 1.000000
75% 71.000000 1.0 28.000000 2.100000 99.000000 1.090000
max 88.000000 1.0 77.000000 4.300000 147.000000 1.360000
FTI target
count 240.000000 240.000000
mean 77.015833 2.620833
std 32.756725 0.565744
min 2.000000 1.000000
25% 60.000000 2.000000
50% 80.000000 3.000000
75% 100.000000 3.000000
max 139.000000 3.000000
df.hist(figsize=(12, 10), bins=20)
plt.suptitle('Histogram of Numerical Features')
plt.show()
# Plot pairplot to visualize the distribution and relationships between features
sns.pairplot(data_cleaned, diag_kind='kde', hue='target')
plt.suptitle('Pairplot of Features', y=1.02)
plt.show()
# Plot boxplots to visualize the distribution and identify outliers
cols_to_drop = ['sex', 'target']
data_subset = data_cleaned.drop(cols_to_drop, axis=1)
plt.figure(figsize=(12, 8))
sns.boxplot(data=data_subset)
plt.title('Boxplot of Numerical Features')
plt.show()
target_counts = df['target'].value_counts(normalize=True) * 100
# Print the results
print("Percentage of individuals based on the 'target' column (thyroid dataset):")
for target, percentage in target_counts.items():
print(f"Target {target}: {percentage:.2f}%")
Percentage of individuals based on the 'target' column (thyroid dataset): Target 3: 43.60% Target 2: 33.88% Target 0: 19.21% Target 1: 3.31%
import seaborn as sns
# Define the list of numerical variables in your dataset
numerical_vars = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']
plt.figure(figsize=(10, 8))
sns.heatmap(data_cleaned[numerical_vars].corr(), annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of Numerical Variables')
plt.show()
# Feature Engineering: Create a new feature representing the ratio of T3 to TT4
data_cleaned['T3_TT4_ratio'] = data_cleaned['T3'] / data_cleaned['TT4']
# Feature Selection: Calculate the correlation between features and the target variable
correlation = data_cleaned.corr()['target'].abs().sort_values(ascending=False)
# Select the top k most correlated features
k = 5 # Number of features to select
selected_features = correlation[1:k+1].index.tolist() # Exclude the target column
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Load your cleaned data into a DataFrame named data_cleaned
# Feature Engineering
data_cleaned['T3_TT4_ratio'] = data_cleaned['T3'] / data_cleaned['TT4']
# Feature Selection
correlation = data_cleaned.corr()['target'].abs().sort_values(ascending=False)
k = 5 # Number of features to select
selected_features = correlation[1:k+1].index.tolist() # Exclude the target column
#Split Dataset into Training and Testing Sets
from sklearn.model_selection import train_test_split
X = data_cleaned.drop('target', axis=1)
y = data_cleaned['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Initialize models
models = {
'Logistic Regression': LogisticRegression(),
'SVM': SVC(),
'Random Forest': RandomForestClassifier()
}
from sklearn.metrics import precision_score, recall_score, f1_score
# Calculate metrics for each model
results = {}
for name, model in models.items():
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted') # Specify average='weighted' for multiclass
recall = recall_score(y_test, y_pred, average='weighted') # Specify average='weighted' for multiclass
f1 = f1_score(y_test, y_pred, average='weighted') # Specify average='weighted' for multiclass
results[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}
# Display results
import pandas as pd
results_df = pd.DataFrame(results).T
print(results_df)
Accuracy Precision Recall F1 Score Logistic Regression 0.979167 0.980655 0.979167 0.979392 SVM 0.958333 0.919048 0.958333 0.938113 Random Forest 1.000000 1.000000 1.000000 1.000000
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Ensure your data is correctly loaded and cleaned
# Example: data_cleaned = pd.read_csv('path_to_your_data.csv')
# Define the selected features and target variable, including the engineered feature
selected_features = ['age', 'sex', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'T3_TT4_ratio']
X = data_cleaned[selected_features]
y = data_cleaned['target']
# Check for missing values
if X.isnull().any().any() or y.isnull().any():
raise ValueError("Data contains missing values. Please handle them before proceeding.")
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Create and train the logistic regression model
logistic_model = LogisticRegression(C=1.0, solver='liblinear', random_state=42)
logistic_model.fit(X_train_scaled, y_train)
# Predict on the test set
y_pred = logistic_model.predict(X_test_scaled)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# Print the evaluation metrics
print("Model Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
Model Evaluation Metrics: Accuracy: 0.9583 Precision: 0.9598 Recall: 0.9583 F1 Score: 0.9552
from sklearn.metrics import accuracy_score
# Assuming models is a dictionary containing model objects
for name, model in models.items():
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"{name} Accuracy: {accuracy:.2f}")
Logistic Regression Accuracy: 0.98 SVM Accuracy: 0.96 Random Forest Accuracy: 1.00
# Dictionary to store accuracies
accuracies = {}
# Iterate through each model
for name, model in models.items():
# Fit the model on the training data
model.fit(X_train, y_train)
# Make predictions on the test data
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Store accuracy in the dictionary
accuracies[name] = accuracy
# Plotting
plt.figure(figsize=(10, 6))
plt.barh(list(accuracies.keys()), list(accuracies.values()), color='skyblue')
plt.xlabel('Accuracy')
plt.title('Accuracy of Different Models')
plt.xlim(0, 1)
plt.show()
import pickle
# Assuming you have a model named 'model' that you want to save
filename = 'thyroid.sav'
pickle.dump(logistic_model, open(filename, 'wb'))
df
| age | sex | TSH | T3 | TT4 | T4U | FTI | target | |
|---|---|---|---|---|---|---|---|---|
| 9 | 61 | 0 | 9.799999 | 1.2 | 114.0 | 0.84 | 136.0 | 3 |
| 10 | 27 | 1 | 90.000000 | 0.4 | 7.5 | 0.94 | 7.5 | 2 |
| 32 | 40 | 1 | 70.000000 | 0.4 | 3.9 | 0.83 | 5.0 | 2 |
| 40 | 57 | 0 | 0.250000 | 4.2 | 236.0 | 0.70 | 337.0 | 0 |
| 44 | 44 | 1 | 8.400000 | 1.8 | 108.0 | 1.01 | 107.0 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2360 | 71 | 1 | 23.000000 | 1.8 | 87.0 | 0.96 | 91.0 | 3 |
| 2369 | 51 | 1 | 106.000000 | 0.6 | 5.0 | 0.89 | 5.5 | 2 |
| 2372 | 66 | 1 | 85.000000 | 1.8 | 118.0 | 1.27 | 93.0 | 3 |
| 2391 | 75 | 1 | 17.000000 | 1.4 | 104.0 | 1.15 | 90.0 | 3 |
| 2392 | 74 | 1 | 53.000000 | 1.0 | 49.0 | 1.25 | 39.0 | 2 |
484 rows × 8 columns